mental_df =
read_csv("./data/mental_data.csv") %>%
janitor::clean_names() %>%
mutate(
any_mental_num = any_mental_num / 1000000,
any_mental_per = any_mental_per * 100,
ser_mental_num = ser_mental_num / 1000000,
ser_mental_per = ser_mental_per * 100,
state_abb = state.abb[match(state, state.name)],
region = state.region[match(state, state.name)]
) %>%
mutate(
state_abb = replace(state_abb, state == "District of Columbia", "DC"))
## Rows: 51 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): state
## dbl (9): any_mental_per, any_mental_num, ser_mental_per, ser_mental_num, tee...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
state_mental=
plot_usmap(
data = mental_df,
regions = "state",
values = "any_mental_per",
labels = TRUE, label_color = "white") +
labs(
title = "Percent of adults reporting any mental illness for each state, 2019-2022"
) +
scale_fill_continuous(
name = "Mental illness percent (%)",
label = scales::comma) +
theme(legend.position = "right")
ggplotly(state_mental)
According to the mental health data collected between 2019 -2020, the mental illness percents are high in the US overal, with variations between states.
any_mental_plot =
mental_df %>%
group_by(region) %>%
drop_na() %>%
summarize(any_mental_num = sum(any_mental_num)) %>%
ggplot(
aes(x = region, y = any_mental_num, fill = region)) +
geom_bar(stat = "identity") +
labs(
title = "Any Mental Illness Number, by Region, 2019-2020",
x = "Region",
y = "Mental illness number (million)",
fill = "Region") +
theme(legend.position = "bottom")
ser_mental_plot =
mental_df %>%
group_by(region) %>%
drop_na() %>%
summarize(ser_mental_num = sum(ser_mental_num)) %>%
ggplot(
aes(x = region, y = ser_mental_num, fill = region)) +
geom_bar(stat = "identity") +
labs(
title = "Serious Mental Illness Number, by Region, 2019-2020",
x = "Region",
y = "Mental illness number (million)",
fill = "Region") +
theme(legend.position = "bottom")
grid.arrange(any_mental_plot, ser_mental_plot, ncol =2)
Comment Both any mental illness and serious mental illness are highest in the South, lowest in the northeast.
any_top10_plot =
mental_df %>%
filter(row_number(desc(any_mental_per)) <= 10) %>%
mutate(
state = fct_reorder(state, any_mental_per)
) %>%
ggplot(
aes(x = any_mental_per, y = state, fill = state)) +
geom_bar(stat = "identity") +
labs(
title = "Any Mental Illness Percent, Top 10 States",
x = "Any Mental illness percent (%)",
y = "State",
fill = "State") +
theme(legend.position = "bottom")
ser_top10_plot =
mental_df %>%
filter(row_number(desc(ser_mental_per)) <= 10) %>%
mutate(
state = fct_reorder(state, ser_mental_per)
) %>%
ggplot(
aes(x = ser_mental_per, y = state, fill = state)) +
geom_bar(stat = "identity") +
labs(
title = "Serious Mental Illness Percent, Top 10 States",
x = "Serious mental illness percent (%)",
y = "State",
fill = "State") +
theme(legend.position = "bottom")
grid.arrange(any_top10_plot, ser_top10_plot, ncol =2)
Comment
The top 10 states for any and serious mental illness are 8/10 the same, except Washington, Rhode Island, Arkansas and Indiana. Ultah has the highest any/serious mental illness percent.
data cleaning
nhis =
read_csv("data/nhis_data01.csv") %>%
janitor::clean_names() %>%
mutate(
worrx = recode_factor(worrx,'1' = "no", '2' = "yes"),
worfreq = recode_factor(worfreq, '1' = "Daily", '2' = "Weekly",
'3' = "Monthly", '4' = "A few times a year",
'5' = "Never"),
worfeelevl = recode_factor(worfeelevl, '1' = "A lot",
'3' = "Somewhere between a little and a lot",
'2' = "A little"),
deprx = recode_factor(deprx, '1' = "no", '2' = "yes"),
depfreq = recode_factor(depfreq, '1' = "Daily", '2' = "Weekly",
'3' = "Monthly", '4' = "A few times a year",
'5' = "Never"),
depfeelevl = recode_factor(depfeelevl, '1' = "A lot",
'3' = "Somewhere between a little and a lot",
'2' = "A little")
)
## Rows: 468212 Columns: 33
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): NHISHID, NHISPID, HHX, FMX, PX
## dbl (28): YEAR, SERIAL, STRATA, PSU, HHWEIGHT, PERNUM, PERWEIGHT, SAMPWEIGHT...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
recent_yr_df =
nhis %>%
filter(year>2019)
number of people reported taken medication for worried, nervous, or anxious feeings from 2015 to 2021.
nhis %>%
filter(year>=2015) %>%
drop_na(worrx) %>%
group_by(year, worrx) %>%
summarize(wor_num = n()) %>%
pivot_wider(
names_from = worrx,
values_from = wor_num
) %>%
mutate(
wor_percentage = yes/(no + yes)*100,
text_label = str_c(yes, " out of ", no + yes)
) %>%
plot_ly(
y = ~wor_percentage,
x = ~year,
color = ~year,
type = "bar",
colors = "viridis",
text = ~text_label) %>%
layout(title = "Percentage of people reported taken medication for worried, nervous, or anxious feeings each year",
xaxis = list (title = ""),
yaxis = list (title = "Percentage")
) %>%
hide_colorbar()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
Frequency of worries
nhis %>%
filter(year>=2015) %>%
drop_na(worfreq) %>%
group_by(year, worfreq) %>%
summarize(count = n()) %>%
group_by(year) %>%
summarize(
percentage=100 * count/sum(count),
sum_count = sum(count),
worfreq = worfreq,
count=count
) %>%
mutate(
text_label = str_c(count, " out of ", sum_count)
) %>%
plot_ly(
y = ~percentage,
x = ~year,
color = ~worfreq,
type = "bar",
colors = "viridis",
text = ~text_label
) %>%
layout(
xaxis = list (title = ""),
yaxis = list (title = "Percentage"),
barmode = 'stack',
legend = list(orientation = 'h')
)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
worry level distribution.
nhis %>%
filter(year>=2015) %>%
drop_na(worfeelevl) %>%
group_by(year, worfeelevl) %>%
summarize(count = n()) %>%
group_by(year) %>%
summarize(
percentage=100 * count/sum(count),
sum_count = sum(count),
worfeelevl = worfeelevl,
count=count
) %>%
mutate(
text_label = str_c(count, " out of ", sum_count)
) %>%
plot_ly(
y = ~percentage,
x = ~year,
color = ~worfeelevl,
type = "bar",
colors = "viridis",
text = ~text_label
) %>%
layout(
title = "Distribution of worry levels",
xaxis = list (title = ""),
yaxis = list (title = "Percentage"),
barmode = 'stack',
legend = list(orientation = 'h')
)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
logistic regression
whether taken medication for worried, nervous, or anxious feelings is associated with covid-19 adjusting for sex and family income level.
glm(worrx ~ sex + poverty + cvddiag,
family=binomial(link='logit'),
data = recent_yr_df) %>%
broom::tidy(conf.int = TRUE) %>%
mutate(
odds_ratio = exp(estimate),
low = exp(conf.low),
high = exp(conf.high)
) %>%
select(term, odds_ratio, low, high) %>%
knitr::kable(digits=2)
| term | odds_ratio | low | high |
|---|---|---|---|
| (Intercept) | 0.08 | 0.07 | 0.09 |
| sex | 2.09 | 1.98 | 2.20 |
| poverty | 0.98 | 0.98 | 0.99 |
| cvddiag | 1.07 | 1.03 | 1.12 |
number of people reported taken medication for depression from 2015 to 2021.
nhis %>%
filter(year>=2015) %>%
drop_na(deprx) %>%
group_by(year, deprx) %>%
summarize(dep_num = n()) %>%
pivot_wider(
names_from = deprx,
values_from = dep_num
) %>%
mutate(
dep_percentage = yes/(no + yes)*100,
text_label = str_c(yes, " out of ", no + yes)
) %>%
plot_ly(
y = ~dep_percentage,
x = ~year,
color = ~year,
type = "bar",
colors = "viridis",
text = ~text_label) %>%
layout(title = "Percentage of people reported taken medication for depression each year",
xaxis = list (title = ""),
yaxis = list (title = "Percentage")
) %>%
hide_colorbar()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
Frequency of feel depressed
nhis %>%
filter(year>=2015) %>%
drop_na(depfreq) %>%
group_by(year, depfreq) %>%
summarize(count = n()) %>%
group_by(year) %>%
summarize(
percentage=100 * count/sum(count),
sum_count = sum(count),
depfreq = depfreq,
count=count
) %>%
mutate(
text_label = str_c(count, " out of ", sum_count)
) %>%
plot_ly(
y = ~percentage,
x = ~year,
color = ~depfreq,
type = "bar",
colors = "viridis",
text = ~text_label
) %>%
layout(
title = "Distribution of frequency of feel depressed",
xaxis = list (title = ""),
yaxis = list (title = "Percentage"),
barmode = 'stack',
legend = list(orientation = 'h')
)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
depression level distribution.
nhis %>%
filter(year>=2015) %>%
drop_na(depfeelevl) %>%
group_by(year, depfeelevl) %>%
summarize(count = n()) %>%
group_by(year) %>%
mutate(
percentage = 100 * count/sum(count),
sum_count = sum(count),
text_label = str_c(count, " out of ", sum_count)
) %>%
plot_ly(
y = ~percentage,
x = ~year,
color = ~depfeelevl,
type = "bar",
colors = "viridis",
text = ~text_label
) %>%
layout(
title = "Distribution of depression levels",
xaxis = list (title = ""),
yaxis = list (title = "Percentage"),
barmode = 'stack',
legend = list(orientation = 'h')
)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
logistic regression
whether taken medication for depression is associated with covid-19 adjusting for sex and family income level.
glm(deprx ~ sex + poverty + cvddiag,
family=binomial(link='logit'),
data = recent_yr_df) %>%
broom::tidy(conf.int = TRUE) %>%
mutate(
odds_ratio = exp(estimate),
low = exp(conf.low),
high = exp(conf.high)
) %>%
select(term, odds_ratio, low, high) %>%
knitr::kable(digits=2)
| term | odds_ratio | low | high |
|---|---|---|---|
| (Intercept) | 0.08 | 0.07 | 0.09 |
| sex | 2.08 | 1.97 | 2.20 |
| poverty | 0.97 | 0.97 | 0.98 |
| cvddiag | 1.05 | 1.00 | 1.10 |
suicide_df =
read_excel(
"./data/suicide_data.xlsx",
sheet = 1,
range = "A1:D351",
skip = 1,
col_names = TRUE) %>%
janitor::clean_names()
suicide_df %>%
group_by(year) %>%
summarize(deaths = sum(deaths)) %>%
ggplot(aes(x = year, y = deaths)) +
geom_line(col = "deepskyblue3", size = 1) +
geom_point(col = "deepskyblue3", size = 2) +
labs(
title = "Suicide Death Toll",
x = "Year",
y = "Suicide death") +
scale_x_continuous(breaks = seq(2014, 2020, 1)) +
scale_y_continuous(breaks = seq(42000,49000, 1000))
Comment: The peak suicide death number was about 48500 per 100K in 2018.
suicide_df %>%
ggplot(aes(x = state, y = death_rate, color = year)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(
title = "Suicide Death Rate for Each State, 2014-2020",
x = "State",
y = "Suicide death rate per 100K")